#read in the data
library(readxl)
attritiondata <- read_excel(path = "Attrition_Data_Col_Renamed.xlsx", sheet = "HR-employee-attrition Data")
df <- read_excel(path = "Attrition_Data_Col_Renamed.xlsx", sheet = "HR-employee-attrition Data")
Question 3
#An's code for Q3
#3a. Age between 18 and 60, no children under 18 and no obvious age outliers.
summary(attritiondata$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 30.00 36.00 36.92 43.00 60.00
#3a. None are labeled as under 18.
attritiondata$Over18[attritiondata$Over18 == 'N']
## character(0)
#My code for Q3
#3c. Give the frequencies (in table format or similar) for Gender, Education, and Occupation. They can be separate tables, if that’s your choice.
#Needs some refining
gendertable <- table(attritiondata$Gender)
gendertable
##
## Female Male
## 588 882
educationtable <- table(attritiondata$Education)
educationtable
##
## 1 2 3 4 5
## 170 282 572 398 48
occupationtable <- table(attritiondata$JobRole)
occupationtable
##
## Healthcare Representative Human Resources
## 131 52
## Laboratory Technician Manager
## 259 102
## Manufacturing Director Research Director
## 145 80
## Research Scientist Sales Executive
## 292 326
## Sales Representative
## 83
#3d. Give the counts (again, table) of management positions.
#Needs some refining
library(plyr)
management <- count(attritiondata$JobRole)
management <- management[management$x=="Manager",]
Question 4
library(ggplot2)
#An's code for Q4b
#4b there seemed to be no relationship between age and MonthlyRate, DailyRate, or HourlyRate
##Monthly Rate
ggplot(data = df, aes(x = df$Age, y = df$MonthlyRate)) + geom_point(aes(colour = factor(Gender))) + geom_smooth(method = "lm", aes(group = Gender, colour = Gender)) + labs(title = "Montly Rate vs Age", x = "Age", y = "Monthly Rate", color = "Gender")

test <- lm(df$MonthlyRate ~ df$Age)
summary(test)
##
## Call:
## lm(formula = df$MonthlyRate ~ df$Age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12452 -6193 -45 6111 13056
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13506.10 773.19 17.468 <2e-16 ***
## df$Age 21.86 20.33 1.075 0.282
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7117 on 1468 degrees of freedom
## Multiple R-squared: 0.0007869, Adjusted R-squared: 0.0001062
## F-statistic: 1.156 on 1 and 1468 DF, p-value: 0.2825
##Daily Rate
ggplot(data = df, aes(x = df$Age, y = df$DailyRate)) + geom_point(aes(colour = factor(Gender))) + geom_smooth(method = "lm", aes(group = Gender, colour = Gender)) + labs(title = "Daily Rate vs Age", x = "Age", y = "Daily Rate", color = "Gender")

test <- lm(df$DailyRate ~ df$Age)
summary(test)
##
## Call:
## lm(formula = df$DailyRate ~ df$Age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -708.06 -337.55 -0.61 355.66 697.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 785.0985 43.8469 17.905 <2e-16 ***
## df$Age 0.4709 1.1528 0.408 0.683
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 403.6 on 1468 degrees of freedom
## Multiple R-squared: 0.0001137, Adjusted R-squared: -0.0005675
## F-statistic: 0.1669 on 1 and 1468 DF, p-value: 0.683
##Hourly Rate
ggplot(data = df, aes(x = df$Age, y = df$HourlyRate)) + geom_point(aes(colour = factor(Gender))) + geom_smooth(method = "lm", aes(group = Gender, colour = Gender)) + labs(title = "Hourly Rate vs Age", x = "Age", y = "Hourly Rate", color = "Gender")

test <- lm(df$HourlyRate ~ df$Age)
summary(test)
##
## Call:
## lm(formula = df$HourlyRate ~ df$Age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.868 -17.517 0.064 17.483 35.078
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63.89557 2.20855 28.931 <2e-16 ***
## df$Age 0.05405 0.05806 0.931 0.352
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.33 on 1468 degrees of freedom
## Multiple R-squared: 0.0005898, Adjusted R-squared: -9.096e-05
## F-statistic: 0.8664 on 1 and 1468 DF, p-value: 0.3521
#4b. MonthlyInco shows correlation with Age. Tried linear regression on untransformed data and log, reciprocal, square root transformations
## untransformed
ggplot(data = df, aes(x = Age, y = MonthlyInco)) + geom_point(aes(colour = Gender)) + geom_smooth(method = 'lm', aes(group = Gender, colour = Gender)) + labs(title = "Montly Income vs Age", x = "Age", y = "Monthly Income", color = "Gender")

## log transformed
ggplot(data = df, aes(x = Age, y = log(MonthlyInco))) + geom_point(aes(colour = Gender)) + geom_smooth(method = 'lm', aes(group = Gender, colour = Gender)) + labs(title = "Montly Income vs Age", x = "Age", y = "log(Monthly Income)", color = "Gender", subtitle = "log transformed")

## reciprocal transformed
ggplot(data = df, aes(x = Age, y = -1/MonthlyInco)) + geom_point(aes(colour = Gender)) + geom_smooth(method = 'lm', aes(group = Gender, colour = Gender)) + labs(title = "Montly Income vs Age", x = "Age", y = "-1/(Monthly Income)", color = "Gender", subtitle = "negative reciprocal transformed")

## square root transformed
ggplot(data = df, aes(x = Age, y = sqrt(MonthlyInco))) + geom_point(aes(colour = Gender)) + geom_smooth(method = 'lm', aes(group = Gender, colour = Gender)) + labs(title = "Montly Income vs Age", x = "Age", y = "sqrt(Monthly Income)", color = "Gender", subtitle = "square root transformed")

#4b. Transformation did not yield better fit, fit test performed on most uncomplicated model, untransformed data
test <- lm(df$MonthlyInco ~ df$Age, subset = df$Gender == 'Male')
summary(test)
##
## Call:
## lm(formula = df$MonthlyInco ~ df$Age, subset = df$Gender == "Male")
##
## Residuals:
## Min 1Q Median 3Q Max
## -9940.2 -2524.5 -603.7 1659.1 12593.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3028.8 577.6 -5.244 1.97e-07 ***
## df$Age 256.7 15.3 16.779 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4106 on 880 degrees of freedom
## Multiple R-squared: 0.2424, Adjusted R-squared: 0.2415
## F-statistic: 281.5 on 1 and 880 DF, p-value: < 2.2e-16
test <- lm(df$MonthlyInco ~ df$Age, subset = df$Gender == 'Female')
summary(test)
##
## Call:
## lm(formula = df$MonthlyInco ~ df$Age, subset = df$Gender == "Female")
##
## Residuals:
## Min 1Q Median 3Q Max
## -9558.6 -2686.3 -783.7 1990.1 12347.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2860.33 695.08 -4.115 4.42e-05 ***
## df$Age 255.74 18.07 14.151 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4057 on 586 degrees of freedom
## Multiple R-squared: 0.2547, Adjusted R-squared: 0.2534
## F-statistic: 200.3 on 1 and 586 DF, p-value: < 2.2e-16
An EDA
ggplot(df, aes(x = OverTime, fill = Attrition)) + geom_bar(position = "fill") + labs(title = "Over Time", x = "Over Time", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

ggplot(df, aes(x = StockOptLvl, fill = Attrition)) + geom_bar(position = "fill") + labs(title = "Stock Option Level", x = "Stock Option Levels", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

ggplot(df, aes(x = HourlyRate, fill = Attrition)) + geom_histogram(position = "fill", bins = 30) + labs(title = "Hourly Rate", x = "Hourly Rate", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

ggplot(df, aes(x = DailyRate, fill = Attrition)) + geom_histogram(position = "fill", bins = 30) + labs(title = "Daily Rate", x = "Daily Rate", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

ggplot(df, aes(x = MonthlyRate, fill = Attrition)) + geom_histogram(position = "fill", bins = 30) + labs(title = "Monthly Rate", x = "Monthly Rate", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

ggplot(df, aes(x = MonthlyInco, fill = Attrition)) + geom_histogram(position = "fill", bins = 30) + labs(title = "Monthly Income", x = "Monthly Income", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

ggplot(df, aes(x = PctSalaryInc, fill = Attrition)) + geom_bar(position = "fill") + labs(title = "Percent Salary Increase", x = "Salary Increase (%)", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

Tori EDA
#Tori EDA
#Gender
ggplot(attritiondata, aes(x = Gender, fill = Attrition)) + geom_bar(position = "fill") + labs(title = "Gender and Attrition", x = "Gender", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

#There is a very high p-value for this regression
genderlmdata <- attritiondata
genderlmdata$Gender[genderlmdata$Gender=="Female"] <- 0
genderlmdata$Gender[genderlmdata$Gender=="Male"] <- 1
genderlm <- lm(genderlmdata$Gender ~ genderlmdata$Attrition)
summary(genderlm)
##
## Call:
## lm(formula = genderlmdata$Gender ~ genderlmdata$Attrition)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6329 -0.5937 0.3671 0.4063 0.4063
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.59367 0.01396 42.542 <2e-16 ***
## genderlmdata$AttritionYes 0.03924 0.03475 1.129 0.259
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.49 on 1468 degrees of freedom
## Multiple R-squared: 0.0008675, Adjusted R-squared: 0.0001869
## F-statistic: 1.275 on 1 and 1468 DF, p-value: 0.2591
#Age
ggplot(attritiondata, aes(x = Age, fill = Attrition)) + geom_bar(position = "fill") + labs(title = "Age and Attrition", x = "Age", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

#There is a very low p-value for this regression
agelm <- lm(attritiondata$Age ~ attritiondata$Attrition)
summary(agelm)
##
## Call:
## lm(formula = attritiondata$Age ~ attritiondata$Attrition)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.561 -6.561 -1.561 5.439 24.392
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.5612 0.2569 146.191 < 2e-16 ***
## attritiondata$AttritionYes -3.9536 0.6399 -6.179 8.36e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.022 on 1468 degrees of freedom
## Multiple R-squared: 0.02535, Adjusted R-squared: 0.02468
## F-statistic: 38.18 on 1 and 1468 DF, p-value: 8.356e-10
#DistanceFromHome
ggplot(attritiondata, aes(x = DistFromHome, fill = Attrition)) + geom_bar(position = "fill") + labs(title = "Distance from Home and Attrition", x = "Distance from Home", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

#There is a medium p-value for this regression
distancelm <- lm(attritiondata$DistFromHome ~ attritiondata$Attrition)
summary(distancelm)
##
## Call:
## lm(formula = attritiondata$DistFromHome ~ attritiondata$Attrition)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.633 -6.916 -1.916 4.367 20.084
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.9157 0.2302 38.722 < 2e-16 ***
## attritiondata$AttritionYes 1.7173 0.5734 2.995 0.00279 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.085 on 1468 degrees of freedom
## Multiple R-squared: 0.006072, Adjusted R-squared: 0.005395
## F-statistic: 8.968 on 1 and 1468 DF, p-value: 0.002793
#EnvironmentSatisfaction
ggplot(attritiondata, aes(x = EnvrSatIndex, fill = Attrition)) +
geom_bar(position = "fill") +
labs(title = "Environment Satisfaction and Attrition") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous("Environment Satisfaction Index", breaks = c(1,2,3,4), labels=c("1 Low", "2 Medium","3 High", "4 Very High"))

#There is a very small p-value on this regression
environmentlm <- lm(attritiondata$EnvrSatIndex ~ attritiondata$Attrition)
summary(environmentlm)
##
## Call:
## lm(formula = attritiondata$EnvrSatIndex ~ attritiondata$Attrition)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7713 -0.7713 0.2287 1.2287 1.5359
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.77129 0.03097 89.474 < 2e-16 ***
## attritiondata$AttritionYes -0.30715 0.07714 -3.982 7.17e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.088 on 1468 degrees of freedom
## Multiple R-squared: 0.01069, Adjusted R-squared: 0.01001
## F-statistic: 15.86 on 1 and 1468 DF, p-value: 7.172e-05
#JobSatisfaction
ggplot(attritiondata, aes(x = JobSatIndex, fill = Attrition)) +
geom_bar(position = "fill") +
labs(title = "Job Satisfaction and Attrition") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous("Job Satisfaction Index", breaks = c(1,2,3,4), labels=c("1 Low", "2 Medium","3 High", "4 Very High"))

#There is a very small p-value on this regression
joblm <- lm(attritiondata$JobSatIndex ~ attritiondata$Attrition)
summary(joblm)
##
## Call:
## lm(formula = attritiondata$JobSatIndex ~ attritiondata$Attrition)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7786 -0.7786 0.2214 1.2214 1.5316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.77859 0.03125 88.916 < 2e-16 ***
## attritiondata$AttritionYes -0.31023 0.07783 -3.986 7.04e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.097 on 1468 degrees of freedom
## Multiple R-squared: 0.01071, Adjusted R-squared: 0.01003
## F-statistic: 15.89 on 1 and 1468 DF, p-value: 7.043e-05
#MaritalStatus
ggplot(attritiondata, aes(x = MaritalState, fill = Attrition)) + geom_bar(position = "fill") + labs(title = "Marital Status and Attrition", x = "Marital Status", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

#RelationshipSatisfaction
ggplot(attritiondata, aes(x = RelpSatIndex, fill = Attrition)) +
geom_bar(position = "fill") +
labs(title = "Relationship Satisfaction and Attrition") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous("Relationship Satisfaction Index", breaks = c(1,2,3,4), labels=c("1 Low", "2 Medium","3 High", "4 Very High"))

#There is a medium to large p-value on this regression
relationshiplm <- lm(attritiondata$RelpSatIndex ~ attritiondata$Attrition)
summary(relationshiplm)
##
## Call:
## lm(formula = attritiondata$RelpSatIndex ~ attritiondata$Attrition)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.734 -0.734 0.266 1.266 1.401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.73398 0.03077 88.854 <2e-16 ***
## attritiondata$AttritionYes -0.13483 0.07663 -1.759 0.0787 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.08 on 1468 degrees of freedom
## Multiple R-squared: 0.002104, Adjusted R-squared: 0.001425
## F-statistic: 3.096 on 1 and 1468 DF, p-value: 0.07871
#WorkLifeBalance
ggplot(attritiondata, aes(x = WorkLifeFit, fill = Attrition)) +
geom_bar(position = "fill") +
labs(title = "Work-Life Balance and Attrition") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous("Work-Life Balance Index", breaks = c(1,2,3,4), labels=c("1 Bad", "2 Good","3 Better", "4 Best"))

#There is a medium to large p-value on this regression
worklifelm <- lm(attritiondata$WorkLifeFit ~ attritiondata$Attrition)
summary(worklifelm)
##
## Call:
## lm(formula = attritiondata$WorkLifeFit ~ attritiondata$Attrition)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7810 -0.6582 0.2190 0.2190 1.3418
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.78102 0.02009 138.462 <2e-16 ***
## attritiondata$AttritionYes -0.12279 0.05002 -2.455 0.0142 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7053 on 1468 degrees of freedom
## Multiple R-squared: 0.004088, Adjusted R-squared: 0.00341
## F-statistic: 6.026 on 1 and 1468 DF, p-value: 0.01421